import scrapy
import json
from nlproject.items import SingleItem

"""
使用方法:
scrapy crawl list -a xpaths='{"first": "//a", "link": "@href", "title": "text()"}' -a frontUrl='https://tieba.baidu.com/f?kw=%E9%9B%B7%E5%87%8C&ie=utf-8&pn=' -a backUrl='' -a startNum=0 -a endNum=1000 -a increase=50       
curl http://localhost:6800/schedule.json -d project=nlproject -d spider=list -d setting=FEED_FORMAT='csv' -d setting=FEED_URI='/Users/zhaohenry/PycharmProjects/nlspider/nlproject/commond123.csv' -d xpaths='{"first": "//a", "title": "text()", "link": "@href"}' -d frontUrl='https://tieba.baidu.com/f?kw=%E9%9B%B7%E5%87%8C&ie=utf-8&pn=' -d backUrl='' -d startNum=0 -d endNum=1000 -d increase=50  
"""

class ListSpider(scrapy.Spider):
    """Generic paginated list spider driven by configurable XPath expressions.

    Crawls a numeric sequence of page URLs built as
    ``frontUrl + pageNum + backUrl`` and, on every page, extracts one record
    per node matched by the ``first`` XPath, with the remaining XPaths
    evaluated relative to each node.
    """
    name = "list"

    def __init__(self, frontUrl, backUrl, startNum, endNum, increase, xpaths, *args, **kwargs):
        """
        :param frontUrl: URL prefix placed before the page number.
        :param backUrl: URL suffix appended after the page number.
        :param startNum: first page offset (inclusive).
        :param endNum: last page offset (inclusive).
        :param increase: step between consecutive page offsets; must be > 0.
        :param xpaths: JSON-encoded dict. Must contain the key ``'first'``
            (XPath selecting the repeating container nodes); every other
            key/value pair is a field name and a relative XPath evaluated
            inside each container.
        :raises KeyError: if ``xpaths`` lacks the mandatory ``'first'`` key.
        :raises ValueError: if ``xpaths`` is not valid JSON or the numeric
            arguments are not parseable as integers.
        """
        super(ListSpider, self).__init__(*args, **kwargs)

        start, stop, step = int(startNum), int(endNum), int(increase)
        # stop + step so that endNum itself is included in the crawl.
        self.start_urls = [
            frontUrl + str(page_num) + backUrl
            for page_num in range(start, stop + step, step)
        ]  # type: list
        # Use the spider's logger instead of print() so output respects
        # Scrapy's LOG_LEVEL setting.
        self.logger.debug("start_urls: %s", self.start_urls)

        self.xpaths = json.loads(xpaths)  # type: dict
        # Pop 'first' out so self.xpaths is left holding only field XPaths;
        # a KeyError here means the caller omitted the mandatory key.
        self.first_xpath = self.xpaths.pop('first')

    def parse(self, response):
        """Yield one dict per container node, mapping field names to the
        first value matched by each field's relative XPath (or ``None``
        when the XPath matches nothing).
        """
        for sel in response.xpath(self.first_xpath):
            record = {}
            for field, xpath in self.xpaths.items():
                # extract_first(): keep only the first match per field.
                record[field] = sel.xpath(xpath).extract_first()
            yield record